// femtorv32, a minimalistic RISC-V RV32I core
//       Bruno Levy, 2020-2021
//
// This file: FGA: Femto Graphics Adapter
//   Note: VRAM is write-only ! (the read port is used by HDMI)
//
// sel_cntl / io_wstrb / io_rstrb gives access to the set of control 
//                                registers and commands:
//
// Write: set register:     value[31:8]                       REG_XXX[7:0]
//        command (1 arg):  arg24[31:8]                  1[7] CMD_XXX[6:0]
//        command (2 args): arg12_2[31:20] arg12_1[19:8] 1[7] CMD_XXX[6:0]
//
// Read:  the value of the register indicated by REG_READREGID
//
// Registers:
// REG_STATUS  (0): vblank[31] hblank[30] drawarea[29] membusy[28] XXXX[27:24] Y[23:12] X[11:0]  
// RESOLUTION  (1): height[23:12] width[11:0]
// COLORMODE   (2): colormapped[3] bpp[2:0] (0:1bpp 1:2bpp 2:4bpp 3:8bpp 4:16bpp)
// DISPLAYMODE (3): magnify[0]
// ORIGIN      (4): origin_pixel_address[23:0] (first scanline starts at this pixel address)
// WRAP        (5): wrap_pixel_address[23:0]   (restart at pixel address 0 when reached)
// READREGID   (6): mapped_regid[2:0]          (the register mapped for read access)
//
// Commands:
// SET_PALETTE_R (1)  arg12_1: cmap entry  arg12_2: R
// SET_PALETTE_G (2)  arg12_1: cmap entry  arg12_2: G
// SET_PALETTE_B (3)  arg12_1: cmap entry  arg12_2: B
// SET_WWINDOW_X (4)  arg12_1: x1 arg12_2: x2
// SET_WWINDOW_Y (5)  arg12_1: y1 arg12_2: y2
// FILLRECT      (6)  arg24: color
//
// The window [x1-x2] [y1-y2] can be used in two different ways:
//   - FILLRECT fills it with the specified color. Operation is
//     complete when membusy goes low in REG_STATUS.
//   - individual pixel values can be specified one by one by 
//     writing to the DAT mapped IO (io_wstrb + sel_dat), pixel
//     address is incremented automatically.
//     This allows emulation of SSD1331/SSD1351 "window write" 
//     command in the three modes for OLED-HDMI mirroring
//
// See FIRMWARE/LIBFEMTOGL/FGA.h, FGA.c and FGA_mode.c

// "Physical mode" sent to the HDMI (choose one of them)
// Note: > 640x480 may make timings fail
//`define MODE_640x480
`define MODE_800x600
//`define MODE_1024x768
//`define MODE_1280x1024

`include "GFX_hdmi.v"

module FGA(
    input wire         pclk, // board clock, input of the graphic PLL that derives the pixel clocks
    input wire 	       clk, // system clock (control registers, window logic and VRAM writes)
    input wire 	       sel, // enables direct VRAM writes; if zero, writes are ignored
                            // (direct writes are also ignored while a window transfer is busy)
    input wire [3:0]   mem_wmask, // mem write mask and strobe (one bit per byte of mem_wdata)
    input wire [16:0]  mem_address, // address in graphic memory (128K), word-aligned
                                    // (bits [16:2] select the 32-bit VRAM word)
    input wire [31:0]  mem_wdata, // data to be written (VRAM data, or control/data port payload)

    output wire [3:0]  gpdi_dp, // HDMI signals, blue, green, red, clock
                                  // gpdi_dn generated by pins (see ulx3s.lpf)

    input wire 	       io_wstrb, // IO: write strobe for the control and data ports
    input wire 	       io_rstrb, // IO: read strobe for the control port
    input wire 	       sel_cntl, // IO: select control register (RW)
    input wire 	       sel_dat, // IO: select data input (W)
    output wire [31:0] rdata     // data read: mapped register when io_rstrb && sel_cntl, else 0
);

`include "GFX_modes.v"

   // Pixel clock (driven by the graphic PLL instantiated in this module).
   wire pixel_clk;

   // Video memory: 32768 words of 32 bits (128 KB).
   reg [31:0] VRAM[0:32767];

   // Colormap: 256 entries of 24 bits, laid out as {R[23:16], G[15:8], B[7:0]}.
   reg [23:0] PALETTE[0:255];

   /************************* HDMI signal generation ***************************/

   // Encoding of the bits-per-pixel field of the COLORMODE register.
   localparam MODE_1bpp  = 3'd0,
              MODE_2bpp  = 3'd1,
              MODE_4bpp  = 3'd2,
              MODE_8bpp  = 3'd3,
              MODE_16bpp = 3'd4;

   // "Logical" video mode, programmed through the control registers.
   reg [11:0] mode_width;              // width, in pixels
   reg [11:0] mode_height;             // height, in pixels
   reg [2:0]  mode_bpp;                // one of the MODE_xbpp constants
   reg        mode_colormapped;        // pixel data is a PALETTE index
   reg        mode_magnify;            // asserted for pixel doubling
   reg [23:0] mode_origin_pix_address; // pixel address of the first scanline
   reg [23:0] mode_wrap_pix_address;   // pixel address where scanning wraps to 0

   // Scan state. This part is just like a VGA generator.
   reg [11:0] X;         // current pixel column
   reg [11:0] Y;         // current scanline
   reg        hsync;     // horizontal synchronization
   reg        vsync;     // vertical synchronization
   reg        draw_area; // asserted if current pixel is in drawing area
   reg        mem_busy;  // asserted if memory transfer is running.

   // Control register read port: the mapped register is presented only
   // during a read access to the control port, zero otherwise.
   reg [31:0] read_reg;
   wire       cntl_read_access = io_rstrb && sel_cntl;
   assign rdata = cntl_read_access ? read_reg : 32'b0;
   
   // We are going to fetch data from video RAM (now stored in BRAM), and then,
   // in colormapped modes, fetch colormap entry. Each fetch introduces some
   // latency -> there is a small pixel pipeline. Each stage needs to have
   // its own copy of all registers it needs (that is, copy pixel address
   // between stage 1 and stage 2 to keep it in sync with pixel data).
   //
   // Stage 0 generates the X,Y coordinates and horizontal,vertical sync signals
   //         (standard in all VGA/DVI/HDMI drivers)
   // Stage 1 generates the pixel address. The unit is in number of pixels.
   //         it handles pixel doubling/scanline doubling in 320x200 resolutions
   //         it also handles page flipping, with the ORIGIN register.
   // Stage 2 fetches pixel data from RAM. It handles pixel address -> word address
   //         translation. It creates its own copy of pixel_address to keep it in
   //         sync with pixel data (1 clock latency)
   // Stage 3 generates R,G,B either from colormap lookup (when mode_colormapped
   //         is set) or from 16 bit pixel data directly. If colormap lookup is used,
   //         it generates an additional cycle of latency.
   //
   // Note: the first two pixel columns are wrong due to latency (the image is
   // shifted two pixels to the right, with garbage in the first two columns),
   // normally we should start fetching from the previous scanline, at the end
   // of hsync, 1 clock in advance in mode 0, and two clocks in advance in mode 1. 
   // I was too lazy to do that, so I just hide the first two columns ! 
   // (so there are two columns missing on the right side of the image).
   // I will do that properly when VRAM will be stored in SDRAM (then I'll have no
   // choice, latency will probably be significantly larger than 2 pixels).
   
   // Stage 0: scan counters (X,Y), sync pulses and drawing-area flag.
   // X counts the pixels of a scanline (blanking included) and wraps at
   // GFX_line_width; Y counts scanlines and wraps at GFX_lines.
   wire scan_end_of_line  = (X == GFX_line_width-1);
   wire scan_end_of_frame = (Y == GFX_lines-1);

   // Sync pulses start after the front porch and last for the sync width.
   wire scan_in_hsync = (X>=GFX_width+GFX_h_front_porch) &&
                        (X<GFX_width+GFX_h_front_porch+GFX_h_sync_width);
   wire scan_in_vsync = (Y>=GFX_height+GFX_v_front_porch) &&
                        (Y<GFX_height+GFX_v_front_porch+GFX_v_sync_width);
   wire scan_visible  = (X<GFX_width) && (Y<GFX_height);

   always @(posedge pixel_clk) begin
      // Column counter, restarting at the end of each scanline.
      X <= scan_end_of_line ? 0 : X+1;

      // Scanline counter, advancing only when a scanline completes.
      if(scan_end_of_line) begin
         Y <= scan_end_of_frame ? 0 : Y+1;
      end

      // Registered outputs (one pixel clock behind X,Y).
      hsync     <= scan_in_hsync;
      vsync     <= scan_in_vsync;
      draw_area <= scan_visible;
   end
   
   // Stage 1: pixel address generation
   //
   // pix_address is the address (in pixels, not in words) of the pixel
   // being scanned. row_start_pix_address remembers where the current
   // framebuffer row starts, so that a row can be replayed in magnify mode.
   reg  [23:0] pix_address;
   reg  [23:0] row_start_pix_address;

   // Start address of the row that follows the current one.
   wire [23:0] row_advance_pix_address =
	       row_start_pix_address + {12'b0, mode_width};

   // WRAP register semantics: scanning restarts at pixel address 0 as soon
   // as the wrap address is *reached*. The comparison is therefore strict:
   // a row starting exactly at mode_wrap_pix_address is already outside of
   // the buffer and must be replaced by the row at address 0 (a non-strict
   // comparison would fetch one row from beyond the wrap address).
   wire [23:0] next_row_start_pix_address =
	       (row_advance_pix_address < mode_wrap_pix_address) ?
                 row_advance_pix_address : 24'd0;

   // Generate pixel address based on scanning coordinates (X,Y) and
   // magnify mode (that doubles the rows and doubles the pixels in
   // the rows).
   always @(posedge pixel_clk) begin
      if(X == 0) begin
	 if(Y == 0) begin
	    // Top of frame: restart from the programmed origin
	    // (this is what implements page flipping / scrolling).
	    row_start_pix_address <= mode_origin_pix_address;
	    pix_address           <= mode_origin_pix_address;
	 end else begin
	    // Increment row address every 2 Y (2 because magnify)
	    if(Y[0] || !mode_magnify) begin
	       row_start_pix_address <= next_row_start_pix_address;
	       pix_address           <= next_row_start_pix_address;
	    end else begin
	       // Magnify mode, repeated scanline: replay the same row.
	       pix_address <= row_start_pix_address;
	    end
	 end
      end else begin
	 // Within a scanline: advance every pixel, or every other pixel
	 // when magnify doubles the pixels.
	 if(X[0] || !mode_magnify) pix_address <= pix_address + 1;
      end
   end

   // Stage 2: pixel data fetch
   //
   // Pixel address -> 32-bit word address translation: a word holds
   // 2, 4, 8, 16 or 32 pixels depending on the number of bits per pixel.
   // Unknown mode_bpp values read word 0.
   wire [23:0] word_address =
	       (mode_bpp == MODE_16bpp) ? (pix_address >> 1) :
	       (mode_bpp == MODE_8bpp ) ? (pix_address >> 2) :
	       (mode_bpp == MODE_4bpp ) ? (pix_address >> 3) :
	       (mode_bpp == MODE_2bpp ) ? (pix_address >> 4) :
	       (mode_bpp == MODE_1bpp ) ? (pix_address >> 5) :
	                                  24'd0;

   // The fetched word and the address of the pixel it was fetched for are
   // registered together, so that the next stage can pick the right field
   // inside the word.
   reg [31:0] pix_word_data_2;
   reg [23:0] pix_address_2;
   always @(posedge pixel_clk) begin
      pix_word_data_2 <= VRAM[word_address[14:0]]; // TODO
      pix_address_2   <= pix_address;
   end

   // Stage 3: generate R,G,B from pixel data

   // Combinatorial extraction of the colormap index from the fetched word.
   // The low bits of the pixel address select the field inside the word
   // (pixel 0 is stored in the least significant bits); the field is then
   // zero-extended to 8 bits.
   wire [7:0] index_8bpp = pix_word_data_2[{pix_address_2[1:0], 3'b000} +: 8];
   wire [3:0] index_4bpp = pix_word_data_2[{pix_address_2[2:0], 2'b00 } +: 4];
   wire [1:0] index_2bpp = pix_word_data_2[{pix_address_2[3:0], 1'b0  } +: 2];
   wire       index_1bpp = pix_word_data_2[pix_address_2[4:0]];

   reg [7:0] pix_color_index_3;
   always @(*) begin
      case(mode_bpp)
	MODE_1bpp: pix_color_index_3 = {7'b0, index_1bpp};
	MODE_2bpp: pix_color_index_3 = {6'b0, index_2bpp};
	MODE_4bpp: pix_color_index_3 = {4'b0, index_4bpp};
	MODE_8bpp: pix_color_index_3 = index_8bpp;
	default:   pix_color_index_3 = 8'd0; // 16bpp and unknown modes: no index
      endcase
   end

   // Size of the displayed zone in physical pixels: the logical mode size,
   // doubled in both directions when magnify is active.
   reg [11:0] maxX;
   reg [11:0] maxY;

   always @(posedge clk) begin
      if(mode_magnify) begin
	 maxX <= {mode_width [10:0], 1'b0};
	 maxY <= {mode_height[10:0], 1'b0};
      end else begin
	 maxX <= mode_width;
	 maxY <= mode_height;
      end
   end

   // Direct-color pixel (R:5 G:6 B:5): odd pixel addresses are stored in the
   // upper half of the fetched word, even ones in the lower half.
   wire [15:0] direct_rgb565 = pix_address_2[0] ? pix_word_data_2[31:16]
	                                        : pix_word_data_2[15:0];

   // Hide what's outside the display zone.
   // Hide the first two columns (I was too lazy to properly handle my
   //  pixel pipeline latency).
   wire pixel_hidden = (X == 0) || (X == 1) || (X >= maxX) || (Y >= maxY);

   reg [7:0]  R,G,B;
   always @(posedge pixel_clk) begin
      if(mode_colormapped) begin
	 // Colormapped: the pixel value indexes the palette.
	 {R,G,B} <= PALETTE[pix_color_index_3];
      end else begin
	 // Direct color: expand each component to 8 bits (low bits zero).
	 R <= {direct_rgb565[15:11], 3'b000};
	 G <= {direct_rgb565[10:5 ], 2'b00 };
	 B <= {direct_rgb565[ 4:0 ], 3'b000};
      end
      // Last assignment wins: hidden pixels are forced to black.
      if(pixel_hidden) {R,G,B} <= 24'b0;
   end

   // Video signal generation and HDMI

   // Clock at 5 times the pixel clock frequency, used by the serializers (DDR).
   wire pixel_clk_x5;

   // The graphic PLL: derives pixel_clk and pixel_clk_x5 from the board clock.
   GFX_PLL gfx_pll(
      .pclk        (pclk),
      .pixel_clk   (pixel_clk),
      .pixel_clk_x5(pixel_clk_x5)
   );

   // The HDMI encoder: takes the pixel color and sync signals, drives the
   // HDMI lanes.
   GFX_hdmi hdmi(
      .pixel_clk   (pixel_clk),
      .pixel_clk_x5(pixel_clk_x5),
      .R           (R),
      .G           (G),
      .B           (B),
      .hsync       (hsync),
      .vsync       (vsync),
      .draw_area   (draw_area),
      .gpdi_dp     (gpdi_dp)
   );
   
   /*************************************************************************/

   // Decoding of the 32-bit word written to the control port.
   //   bit 7      : 1 for a command, 0 for a register write
   //   bits [2:0] : command id or register id
   //   bits [31:8]: one 24-bit argument, or two 12-bit arguments
   //                (arg12_1 in bits [19:8], arg12_2 in bits [31:20])
   wire [23:0] arg24      = mem_wdata[31:8];
   wire [11:0] arg12_1    = arg24[11:0];
   wire [11:0] arg12_2    = arg24[23:12];
   wire        is_command = mem_wdata[7];
   wire [2:0]  command    = mem_wdata[2:0];
   wire [2:0]  set_regid  = mem_wdata[2:0];

   // Register ids
   localparam REG_STATUS      = 3'd0,
              REG_RESOLUTION  = 3'd1,
              REG_COLORMODE   = 3'd2,
              REG_DISPLAYMODE = 3'd3,
              REG_ORIGIN      = 3'd4,
              REG_WRAP        = 3'd5,
              REG_READREGID   = 3'd6;

   // Command ids
   localparam CMD_SET_PALETTE_R = 3'd1,
              CMD_SET_PALETTE_G = 3'd2,
              CMD_SET_PALETTE_B = 3'd3,
              CMD_SET_WWINDOW_X = 3'd4,
              CMD_SET_WWINDOW_Y = 3'd5,
              CMD_FILLRECT      = 3'd6;
   
   // Windowed-pixel write and fillrect command.
   //
   // - write window command, two commands:
   //     (send 32 bits to IO_FGA_CNTL hardware register)
   //   SET_WWINDOW_X: X1 X2
   //   SET_WWINDOW_Y: Y1 Y2
   //
   // - write data: send 16 bits to IO_FGA_DAT hardware register
   //    MSB first, encoding follows SSD1351: RRRRR GGGGG 0 BBBBB
   //
   //  Note that once the window is properly initialized, the write
   // data command emulates the SSD1351 OLED display, then by writing
   // to both FGA and SSD1351 control registers, one clones the output
   // of the SSD1351 oled display to the HDMI screen for free !
   //
   // See in <femtorv32.h>:
   // #define IO_GFX_DAT (IO_SSD1351_DAT16 | IO_FGA_DAT)
   // #define OLED_WRITE_DATA_UINT16(RGB) IO_OUT(IO_GFX_DAT,(RGB)) 
   // #define OLED_WRITE_DATA_RGB(R,G,B)  OLED_WRITE_DATA_UINT16(GL_RGB(R,G,B))
   //
   // This also works when FGA is in paletted mode (320x200x8bpp, 640x400x4bpp)
   // since the write data command properly interprets pixel addresses. The
   // only requirement is to have a palette that will correctly map the 8 LSBs
   // / 4 LSBs of pixel data to a color. In libfemtorv32, this maps 0 to black
   // and any non-zero to white (this is how COMMANDER is displayed in 640x400
   // on the HDMI screen).
   //
   // To generate pixel data, there are two other options:
   //   - directly writing to VRAM from FemtoRV32
   //   - FILLRECT (see below)
   
   
   // Write window state:
   //   window_x1..window_x2, window_y1..window_y2: window bounds (inclusive)
   //   window_x, window_y: coordinates of the next pixel to be written
   //   window_row_start:   pixel address of the start of the current window row
   //   window_pixel_address: pixel address of the next pixel to be written
   //   fill_color, fill_rect: color and "running" flag of the FILLRECT command
   reg [11:0] window_x1, window_x2, window_y1, window_y2, window_x, window_y;
   reg [23:0] window_row_start;
   reg [23:0] window_pixel_address;
   reg [15:0] fill_color;
   reg        fill_rect;

   // Data read from control register: depends on mapped register (read_regid)
   //
   // REG_STATUS (also returned for any unlisted register id) is laid out as
   //   vblank[31] hblank[30] drawarea[29] membusy[28] 0000[27:24] Y[23:12] X[11:0]
   // that is, Y in the upper 12-bit field and X in the lower one, like
   // height/width in REG_RESOLUTION.
   //
   // NOTE(review): X, Y and draw_area are updated on pixel_clk and sampled
   // here on clk without synchronization; the vblank/hblank thresholds
   // (400 and 640) are hard-coded rather than derived from the selected
   // video mode - confirm both are intended.
   reg [2:0]  read_regid;
   always @(posedge clk) begin
      case(read_regid)
	REG_RESOLUTION:  read_reg <= {8'b0, mode_height, mode_width};
	REG_COLORMODE:   read_reg <= {28'b0, mode_colormapped, mode_bpp};
	REG_DISPLAYMODE: read_reg <= {31'b0, mode_magnify};
	REG_ORIGIN:      read_reg <= {8'b0, mode_origin_pix_address};
	REG_WRAP:        read_reg <= {8'b0, mode_wrap_pix_address};
	REG_READREGID:   read_reg <= {29'b0, read_regid};
	default:         read_reg <= {(Y >= 400),(X >= 640),draw_area,mem_busy,4'b0,Y,X};
      endcase
   end
   
   // Window cursor and control port writes (system clock domain).
   //
   // The two parts below are evaluated in this order on purpose: a control
   // port write is processed after the cursor update, so that when both
   // assign the same register in the same cycle, the control write wins.

   // A pixel is consumed when a transfer is running and either FILLRECT is
   // active or the CPU writes to the data port.
   wire        window_step        = mem_busy && (fill_rect || (io_wstrb && sel_dat));
   wire        window_at_row_end  = (window_x == window_x2);
   wire        window_at_last_row = (window_y == window_y2);
   wire [23:0] window_next_row    = window_row_start + {12'b0, mode_width};

   // Palette commands: entry number in arg12_1, component value in arg12_2.
   wire [7:0]  palette_slot  = arg12_1[7:0];
   wire [7:0]  palette_level = arg12_2[7:0];

   // Pixel address of the top-left corner of the window, for SET_WWINDOW_Y
   // (arg12_1 is y1, and x1 was set by a previous SET_WWINDOW_X).
   /* verilator lint_off WIDTH */
   wire [23:0] window_first_pixel = arg12_1 * mode_width + window_x1;
   /* verilator lint_on WIDTH */

   // NOTE(review): FILLRECT sets fill_rect unconditionally, and fill_rect is
   // only cleared when a running transfer (mem_busy) reaches the end of the
   // window. Issuing FILLRECT while mem_busy is low therefore leaves
   // fill_rect set until a new window is programmed - confirm that firmware
   // always sets the window right before FILLRECT.
   always @(posedge clk) begin

      // ---- Window cursor update ----
      if(window_step) begin
	 if(window_at_row_end && !window_at_last_row) begin
	    // End of a row that is not the last one: go to the start of
	    // the next row of the window.
	    window_x             <= window_x1;
	    window_y             <= window_y + 1;
	    window_row_start     <= window_next_row;
	    window_pixel_address <= window_next_row;
	 end else begin
	    // Next pixel of the same row.
	    window_x             <= window_x + 1;
	    window_pixel_address <= window_pixel_address + 1;
	    if(window_at_row_end) begin
	       // Last pixel of the last row: transfer complete.
	       mem_busy  <= 1'b0;
	       fill_rect <= 1'b0;
	    end
	 end
      end

      // ---- Control port write ----
      if(io_wstrb && sel_cntl) begin
	 if(!is_command) begin
	    // Register write, value in arg24.
	    case(set_regid)
	      REG_RESOLUTION: begin
		 mode_height <= arg24[23:12];
		 mode_width  <= arg24[11:0];
	      end
	      REG_COLORMODE: begin
		 mode_colormapped <= arg24[3];
		 mode_bpp         <= arg24[2:0];
	      end
	      REG_DISPLAYMODE: mode_magnify            <= arg24[0];
	      REG_ORIGIN:      mode_origin_pix_address <= arg24;
	      REG_WRAP:        mode_wrap_pix_address   <= arg24;
	      REG_READREGID:   read_regid              <= arg24[2:0];
	      default: begin end
	    endcase
	 end else begin
	    case(command)
	      CMD_SET_PALETTE_R: PALETTE[palette_slot][23:16] <= palette_level;
	      CMD_SET_PALETTE_G: PALETTE[palette_slot][15:8]  <= palette_level;
	      CMD_SET_PALETTE_B: PALETTE[palette_slot][7:0 ]  <= palette_level;
	      CMD_SET_WWINDOW_X: begin
		 // Horizontal bounds; the cursor goes back to x1.
		 window_x1 <= arg12_1;
		 window_x2 <= arg12_2;
		 window_x  <= arg12_1;
		 mem_busy  <= 1'b1;
	      end
	      CMD_SET_WWINDOW_Y: begin
		 // Vertical bounds; the cursor goes to the first pixel of
		 // the window.
		 window_y1 <= arg12_1;
		 window_y2 <= arg12_2;
		 window_y  <= arg12_1;
		 mem_busy  <= 1'b1;
		 window_row_start     <= window_first_pixel;
		 window_pixel_address <= window_first_pixel;
	      end
	      CMD_FILLRECT: begin
		 // Start filling the current window with the given color.
		 fill_color <= arg24[15:0];
		 fill_rect  <= 1'b1;
	      end
	      default: begin end
	    endcase
	 end
      end
   end

   // Write to VRAM (FILLRECT and interface with processor)
   //
   // Two write sources share the single VRAM write port:
   //   - the write window (FILLRECT, or pixels sent to the data port), that
   //     writes one pixel at window_pixel_address;
   //   - direct word/byte writes from the processor at mem_address.
   // The window has priority, and direct writes are ignored while a window
   // transfer is pending (mem_busy).

   // Word index of a direct write (mem_address is a byte address).
   wire [14:0] vram_word_address = mem_address[16:2];
   // Pixel written through the window: the FILLRECT color while a fill is
   // running, else the low 16 bits of the data sent by the processor.
   wire [15:0] pixel_color = fill_rect ? fill_color : mem_wdata[15:0];

   // FILLRECT:
   // The fillrect command repeatedly sends the same pixel data to the current
   // window. It has two advantages as compared to do that by hand:
   //   - fills one pixel per clock (whereas in its fastest configuration,
   //     FemtoRV32 uses 6 clocks per loop iteration)
   //   - execution can continue, which lets FemtoRV prepare the next drawing
   //     operation. Before sending more data to FGA, FemtoRV needs to test
   //     the FGA_BUSY_bit in the control register, as follows:
   //         while(IO_IN(IO_FGA_CNTL) & FGA_BUSY_bit);
   // This is used in LIBFEMTORV32/FGA.c, to implement hardware-accelerated 
   // polygon fill (using one FILLRECT call per polygon scanline).
   
   always @(posedge clk) begin
      // FILLRECT or pixel data sent to the graphic data port
      // In each mode, the upper bits of the pixel address select the VRAM
      // word and the lower bits select the pixel field inside that word
      // (pixel 0 in the least significant bits). Only the low bits of
      // pixel_color that fit in a pixel are stored.
      if(fill_rect || (io_wstrb && sel_dat && mem_busy)) begin
	 /* verilator lint_off CASEINCOMPLETE */	 
	 case(mode_bpp)
	   MODE_16bpp: begin // 2 pixels per word
	      case(window_pixel_address[0])
	        1'b0: VRAM[window_pixel_address[15:1]][15:0 ] <= pixel_color;
	        1'b1: VRAM[window_pixel_address[15:1]][31:16] <= pixel_color;
	      endcase
	   end
	   MODE_8bpp: begin // 4 pixels per word
	      case(window_pixel_address[1:0])
                2'b00: VRAM[window_pixel_address[16:2]][ 7:0 ] <= pixel_color[7:0];
                2'b01: VRAM[window_pixel_address[16:2]][15:8 ] <= pixel_color[7:0];
                2'b10: VRAM[window_pixel_address[16:2]][23:16] <= pixel_color[7:0];
                2'b11: VRAM[window_pixel_address[16:2]][31:24] <= pixel_color[7:0];		  
	      endcase
	   end
	   MODE_4bpp: begin // 8 pixels per word
	      case(window_pixel_address[2:0])
                3'b000: VRAM[window_pixel_address[17:3]][ 3:0 ] <= pixel_color[3:0];
                3'b001: VRAM[window_pixel_address[17:3]][ 7:4 ] <= pixel_color[3:0];
                3'b010: VRAM[window_pixel_address[17:3]][11:8 ] <= pixel_color[3:0];
                3'b011: VRAM[window_pixel_address[17:3]][15:12] <= pixel_color[3:0];
                3'b100: VRAM[window_pixel_address[17:3]][19:16] <= pixel_color[3:0];
                3'b101: VRAM[window_pixel_address[17:3]][23:20] <= pixel_color[3:0];
                3'b110: VRAM[window_pixel_address[17:3]][27:24] <= pixel_color[3:0];
                3'b111: VRAM[window_pixel_address[17:3]][31:28] <= pixel_color[3:0];		   		   
	      endcase
	   end 
	   MODE_2bpp: begin // 16 pixels per word
	      case(window_pixel_address[3:0])
                4'b0000: VRAM[window_pixel_address[18:4]][ 1:0 ] <= pixel_color[1:0];
                4'b0001: VRAM[window_pixel_address[18:4]][ 3:2 ] <= pixel_color[1:0];
                4'b0010: VRAM[window_pixel_address[18:4]][ 5:4 ] <= pixel_color[1:0];
                4'b0011: VRAM[window_pixel_address[18:4]][ 7:6 ] <= pixel_color[1:0];
                4'b0100: VRAM[window_pixel_address[18:4]][ 9:8 ] <= pixel_color[1:0];
                4'b0101: VRAM[window_pixel_address[18:4]][11:10] <= pixel_color[1:0];
                4'b0110: VRAM[window_pixel_address[18:4]][13:12] <= pixel_color[1:0];
                4'b0111: VRAM[window_pixel_address[18:4]][15:14] <= pixel_color[1:0];
                4'b1000: VRAM[window_pixel_address[18:4]][17:16] <= pixel_color[1:0];		
                4'b1001: VRAM[window_pixel_address[18:4]][19:18] <= pixel_color[1:0];
                4'b1010: VRAM[window_pixel_address[18:4]][21:20] <= pixel_color[1:0];
                4'b1011: VRAM[window_pixel_address[18:4]][23:22] <= pixel_color[1:0];
                4'b1100: VRAM[window_pixel_address[18:4]][25:24] <= pixel_color[1:0];
                4'b1101: VRAM[window_pixel_address[18:4]][27:26] <= pixel_color[1:0];
                4'b1110: VRAM[window_pixel_address[18:4]][29:28] <= pixel_color[1:0];
                4'b1111: VRAM[window_pixel_address[18:4]][31:30] <= pixel_color[1:0];		   		   
	      endcase
	   end 
	   default: begin // 1bpp (32 pixels per word); also taken for unlisted mode_bpp values
	      VRAM[window_pixel_address[19:5]][window_pixel_address[4:0]] <= pixel_color[0];		   		   	      
	   end
	 endcase 
	 /* verilator lint_on CASEINCOMPLETE */	 	 
      end else if(sel && !mem_busy) begin // Direct VRAM write from FemtoRV32
	 // One write enable per byte of the word.
	 if(mem_wmask[0]) VRAM[vram_word_address][ 7:0 ] <= mem_wdata[ 7:0 ];
	 if(mem_wmask[1]) VRAM[vram_word_address][15:8 ] <= mem_wdata[15:8 ];
	 if(mem_wmask[2]) VRAM[vram_word_address][23:16] <= mem_wdata[23:16];
	 if(mem_wmask[3]) VRAM[vram_word_address][31:24] <= mem_wdata[31:24];	 
      end 
   end
   
endmodule
	   
